Data: read counts table from both runs
rRNAs_2 <- read.table(
file = "rRNA_counts/20190308_sample2_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_3 <- read.table(
file = "rRNA_counts/20190308_sample3_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_4 <- read.table(
file = "rRNA_counts/20190308_sample4_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_5 <- read.table(
file = "rRNA_counts/20190308_sample5_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_6 <- read.table(
file = "rRNA_counts/20190308_sample6_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_7 <- read.table(
file = "rRNA_counts/20190308_sample7_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_8 <- read.table(
file = "rRNA_counts/20190308_sample8_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_9 <- read.table(
file = "rRNA_counts/20190308_sample9_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_11 <- read.table(
file = "rRNA_counts/20190308_sample11_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_14 <- read.table(
file = "rRNA_counts/20190308_sample14_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
merged_rrna_counts_run3 <- cbind(rRNAs_2, rRNAs_3, rRNAs_4, rRNAs_5, rRNAs_6, rRNAs_7, rRNAs_8, rRNAs_9, rRNAs_11, rRNAs_14)
colnames(merged_rrna_counts_run3) <- c(
"MS28F1_sample2", "MS28F1_sample3",
"MS28F1_sample4", "MS28F1_sample5",
"MS28F1_sample6", "MS28F1_sample7",
"MS28F1_sample8", "MS28F1_sample9",
"MS28F1_sample11", "MS28F1_sample14"
)
rRNAs_2 <- read.table(
file = "rRNA_counts/20190110_sample2_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_3 <- read.table(
file = "rRNA_counts/20190110_sample3_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_4 <- read.table(
file = "rRNA_counts/20190110_sample4_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_5 <- read.table(
file = "rRNA_counts/20190110_sample5_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_6 <- read.table(
file = "rRNA_counts/20190110_sample6_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_7 <- read.table(
file = "rRNA_counts/20190110_sample7_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_8 <- read.table(
file = "rRNA_counts/20190110_sample8_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_9 <- read.table(
file = "rRNA_counts/20190110_sample9_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_11 <- read.table(
file = "rRNA_counts/20190110_sample11_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
rRNAs_14 <- read.table(
file = "rRNA_counts/20190110_sample14_rrna_counts.txt",
header = TRUE, sep = "", row.names = 1
)
merged_rrna_counts_run4 <- cbind(rRNAs_2, rRNAs_3, rRNAs_4, rRNAs_5, rRNAs_6, rRNAs_7, rRNAs_8, rRNAs_9, rRNAs_11, rRNAs_14)
colnames(merged_rrna_counts_run4) <- c(
"MS28F1_sample2", "MS28F1_sample3",
"MS28F1_sample4", "MS28F1_sample5",
"MS28F1_sample6", "MS28F1_sample7",
"MS28F1_sample8", "MS28F1_sample9",
"MS28F1_sample11", "MS28F1_sample14"
)
df <- merge(merged_rrna_counts_run3, merged_rrna_counts_run4, by = "row.names")
rownames(df) <- df$Row.names
df <- df[, !(names(df) %in% "Row.names")]
knitr::kable(head(df))
| Gm22076 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| Gm22093 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
| Gm22094 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
2 |
0 |
0 |
0 |
0 |
| Gm22135 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
1 |
2 |
0 |
1 |
0 |
0 |
2 |
0 |
| Gm22164 |
0 |
0 |
2 |
0 |
0 |
3 |
2 |
3 |
0 |
0 |
1 |
3 |
0 |
0 |
1 |
2 |
5 |
3 |
2 |
0 |
| Gm22216 |
0 |
0 |
4 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
0 |
0 |
0 |
0 |
0 |
Specifying the group & batch while filterByExprs()
counts_total <- df
d <- data.frame(
row.names = colnames(counts_total),
group = c(
"MSUS", "Control", "MSUS", "Control", "MSUS", "Control",
"MSUS", "Control", "Control", "MSUS", "MSUS", "Control",
"MSUS", "Control", "MSUS", "Control", "MSUS", "Control",
"Control", "MSUS"
)
)
counts_total[is.na(counts_total)] <- 0
dds <- edgeR::calcNormFactors(DGEList(counts_total), method = "TMM")
dds$samples$group <- c(
"MSUS", "Control", "MSUS", "Control", "MSUS", "Control",
"MSUS", "Control", "Control", "MSUS", "MSUS", "Control",
"MSUS", "Control", "MSUS", "Control", "MSUS", "Control",
"Control", "MSUS"
)
dds$samples$batch <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
mm <- model.matrix(~ dds$samples$batch + dds$samples$group, data = d)
dds <- dds[which(filterByExpr(dds, design = mm, min.count = 10, min.prop = 0.9)), ]
en <- log1p(edgeR::cpm(dds)) # normalization by counts per million
PCA plots of normalized counts-per-million
All samples (normalized by counts-per-million), colored by group and shaped by batched
plgINS::plPCA(en,
colorBy = dds$samples$group, add.labels = FALSE,
points.size = 12, shapeBy = paste0("Batch", dds$samples$batch)
)
All samples (normalized by counts-per-million), colored by library size and shaped by group
plgINS::plPCA(en,
colorBy = dds$samples$lib.size, add.labels = FALSE,
points.size = 12, shapeBy = dds$samples$group
)
All samples (normalized by counts-per-million), colored by batch
Note: the samples are clearly separated by sequencing batch & therefore batch correction is required.
plgINS::plPCA(en,
colorBy = dds$samples$batch,
add.labels = FALSE, points.size = 20
)
Differential expression analysis without batch-correction
dds <- estimateDisp(dds, mm)
fit <- glmLRT(glmFit(dds, mm), coef = "dds$samples$groupMSUS")
res_without_ruvseq <- as.data.frame(topTags(fit, Inf))
knitr::kable(head(res_without_ruvseq, n = 30))
| n-R5s210 |
0.4308871 |
9.981542 |
4.0389801 |
0.0444607 |
0.3112248 |
| Gm24601 |
-0.1900863 |
17.181352 |
1.8416038 |
0.1747631 |
0.5517324 |
| n-R5s41 |
0.3309331 |
10.888179 |
0.8553845 |
0.3550335 |
0.5517324 |
| n-R5s5_n-R5s52 |
-0.1167098 |
14.485231 |
0.5615409 |
0.4536401 |
0.5517324 |
| n-R5-8s1 |
-0.1480348 |
19.863887 |
0.5323830 |
0.4656067 |
0.5517324 |
| n-R5s194 |
0.1049421 |
11.669698 |
0.4539355 |
0.5004720 |
0.5517324 |
| n-R5s151 |
0.0837534 |
14.378569 |
0.3542227 |
0.5517324 |
0.5517324 |
write_csv(res_without_ruvseq, path = "res_without_ruvseq.csv")
Differential expression analysis with RUVSeq, SV=2 (batch correction with Surrogate Variable analysis)
re <- RUVSeq::RUVs(en,
cIdx = row.names(en), k = 1,
scIdx = RUVSeq::makeGroups(d$group), isLog = TRUE
)
d$SV1 <- re$W[, 1]
mm <- model.matrix(~ SV1 + dds$samples$batch + dds$samples$group, data = d)
dds <- estimateDisp(dds, mm)
fit <- glmLRT(glmFit(dds, mm), coef = "dds$samples$groupMSUS")
res_with_ruvseq1 <- as.data.frame(topTags(fit, Inf))
knitr::kable(head(res_with_ruvseq1, n = 100))
| n-R5s210 |
0.4555366 |
9.982874 |
4.5966138 |
0.0320352 |
0.2242462 |
| Gm24601 |
-0.2365131 |
17.181338 |
2.8253213 |
0.0927883 |
0.3247591 |
| n-R5-8s1 |
-0.2717411 |
19.863922 |
1.9400793 |
0.1636599 |
0.3482770 |
| n-R5s5_n-R5s52 |
-0.1856209 |
14.483958 |
1.6495848 |
0.1990155 |
0.3482770 |
| n-R5s194 |
0.0853779 |
11.673358 |
0.3130548 |
0.5758117 |
0.6368089 |
| n-R5s151 |
0.0726863 |
14.378230 |
0.2550845 |
0.6135175 |
0.6368089 |
| n-R5s41 |
0.0765466 |
10.896399 |
0.2229400 |
0.6368089 |
0.6368089 |
write_csv(res_with_ruvseq1, path = "res_with_ruvseq1.csv")
PCA plots after correction with 1 Surrogate Variable
All samples, colored by treatment group
plgINS::plPCA(re$normalizedCounts,
colorBy = d$group,
add.labels = FALSE, points.size = 20
)
All samples, colored by library size
plgINS::plPCA(re$normalizedCounts,
colorBy = dds$samples$lib.size,
add.labels = FALSE, points.size = 20
)
All samples, colored by sequencing batch
Note: samples are still separated on PCA by batch, therefore additional SV is necessary and run for the analysis below.
plgINS::plPCA(re$normalizedCounts,
colorBy = dds$samples$batch,
add.labels = FALSE, points.size = 20
)
PCA plots correction with 2 Surrogate Variables
All samples, colored by treatment group
re <- RUVSeq::RUVs(en,
cIdx = row.names(en), k = 2,
scIdx = RUVSeq::makeGroups(d$group), isLog = TRUE
)
d$SV1 <- re$W[, 1]
d$SV2 <- re$W[, 2]
mm <- model.matrix(~ SV1 + d$SV2 + dds$samples$batch + dds$samples$group, data = d)
plgINS::plPCA(re$normalizedCounts,
colorBy = d$group,
add.labels = FALSE, points.size = 20
)
All samples, colored by library size
plgINS::plPCA(re$normalizedCounts,
colorBy = dds$samples$lib.size,
add.labels = FALSE, points.size = 20
)
All samples, colored by sequencing run
Note: The batch effect is corrected and the samples are not separated by sequencing batch. Thefore, The results of this differentially expression analysis are used for final conclusions:
plgINS::plPCA(re$normalizedCounts,
colorBy = dds$samples$batch,
add.labels = FALSE, points.size = 20
)
Differential expression analysis with RUVSeq, SV=2 (batch correction with Surrogate Variable analysis)
dds <- estimateDisp(dds, mm)
fit <- glmLRT(glmFit(dds, mm), coef = "dds$samples$groupMSUS")
res_with_ruvseq2 <- as.data.frame(topTags(fit, Inf))
knitr::kable(head(res_with_ruvseq2, n = 80))
| n-R5s5_n-R5s52 |
-0.2819005 |
14.480710 |
4.6587032 |
0.0308965 |
0.2162756 |
| n-R5s210 |
0.3709225 |
9.984833 |
3.0494747 |
0.0807633 |
0.2826714 |
| Gm24601 |
-0.1642624 |
17.181294 |
1.5319267 |
0.2158235 |
0.5035882 |
| n-R5-8s1 |
-0.1184372 |
19.864018 |
0.9220752 |
0.3369307 |
0.5896287 |
| n-R5s41 |
0.0718775 |
10.910322 |
0.2643030 |
0.6071792 |
0.7083085 |
| n-R5s194 |
-0.0590254 |
11.680319 |
0.2394775 |
0.6245837 |
0.7083085 |
| n-R5s151 |
-0.0432692 |
14.377380 |
0.1399724 |
0.7083085 |
0.7083085 |
write_csv(res_with_ruvseq2, path = "res_with_ruvseq2.csv")
References
report::cite_packages(sessionInfo())
## - Amezquita R, Lun A, Becht E, Carey V, Carpp L, Geistlinger L, Marini F,Rue-Albrecht K, Risso D, Soneson C, Waldron L, Pages H, Smith M, HuberW, Morgan M, Gottardo R, Hicks S (2020). "Orchestrating single-cellanalysis with Bioconductor." _Nature Methods_, *17*, 137-145. <URL:https://www.nature.com/articles/s41592-019-0654-x>.
## - Ben Bolstad (2021). preprocessCore: A collection of pre-processing functions. R package version 1.52.1. https://github.com/bmbolstad/preprocessCore
## - Charlotte Soneson, Michael I. Love, Mark D. Robinson (2015): Differential analyses for RNA-seq: transcript-level estimates improve gene-level inferences. F1000Research
## - Constantin Ahlmann-Eltze, Peter Hickey and Hervé Pagès (2021). MatrixGenerics: S4 Generic Summary Statistic Functions that Operate on Matrix-Like Objects. R package version 1.2.1. https://bioconductor.org/packages/MatrixGenerics
## - C. Sievert. Interactive Web-Based Data Visualization with R, plotly, and shiny. Chapman and Hall/CRC Florida, 2020.
## - D. Risso, J. Ngai, T. P. Speed and S. Dudoit (2014). Normalization of RNA-seq data using factor analysis of control genes or samples Nature Biotechnology, 32(9), 896-902
## - D. Risso, K. Schwartz, G. Sherlock and S. Dudoit (2011). GC-Content Normalization for RNA-Seq Data. BMC Bioinformatics 12:480
## - Erich Neuwirth (2014). RColorBrewer: ColorBrewer Palettes. R package version 1.1-2. https://CRAN.R-project.org/package=RColorBrewer
## - Gregory R. Warnes, Ben Bolker, Lodewijk Bonebakker, Robert Gentleman, Wolfgang Huber, Andy Liaw, Thomas Lumley, Martin Maechler, Arni Magnusson, Steffen Moeller, Marc Schwartz and Bill Venables (2020). gplots: Various R Programming Tools for Plotting Data. R package version 3.1.1. https://CRAN.R-project.org/package=gplots
## - Hadley Wickham (2007). Reshaping Data with the reshape Package. Journal of Statistical Software, 21(12), 1-20. URL http://www.jstatsoft.org/v21/i12/.
## - Hadley Wickham (2011). The Split-Apply-Combine Strategy for Data Analysis. Journal of Statistical Software, 40(1), 1-29. URL http://www.jstatsoft.org/v40/i01/.
## - Hadley Wickham and Dana Seidel (2020). scales: Scale Functions for Visualization. R package version 1.1.1. https://CRAN.R-project.org/package=scales
## - Hadley Wickham and Jennifer Bryan (2019). readxl: Read Excel Files. R package version 1.3.1. https://CRAN.R-project.org/package=readxl
## - Hadley Wickham and Jim Hester (2020). readr: Read Rectangular Text Data. R package version 1.4.0. https://CRAN.R-project.org/package=readr
## - Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2021). dplyr: A Grammar of Data Manipulation. R package version 1.0.5. https://CRAN.R-project.org/package=dplyr
## - Henrik Bengtsson (2021). matrixStats: Functions that Apply to Rows and Columns of Matrices (and to Vectors). R package version 0.58.0. https://CRAN.R-project.org/package=matrixStats
## - Hervé Pagès and Patrick Aboyoun (2020). XVector: Foundation of external vector representation and manipulation in Bioconductor. R package version 0.30.0. https://bioconductor.org/packages/XVector
## - Hervé Pagès, Marc Carlson, Seth Falcon and Nianhua Li (2020). AnnotationDbi: Manipulation of SQLite-based annotations in Bioconductor. R package version 1.52.0. https://bioconductor.org/packages/AnnotationDbi
## - Hoffman GE, Schadt EE (2016). variancePartition: Interpreting drivers of variation in complex gene expression studies. BMC Bioinformatics, 17:483, doi:10.1186/s12859-016-1323-z
## - H. Pagès, M. Lawrence and P. Aboyoun (2020). S4Vectors: Foundation of vector-like and list-like containers in Bioconductor. R package version 0.28.1. https://bioconductor.org/packages/S4Vectors
## - H. Pagès, P. Aboyoun, R. Gentleman and S. DebRoy (2020). Biostrings: Efficient manipulation of biological strings. R package version 2.58.0. https://bioconductor.org/packages/Biostrings
## - H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York, 2016.
## - Igor Dolgalev (2020). msigdbr: MSigDB Gene Sets for Multiple Organisms in a Tidy Data Format. R package version 7.2.1. https://CRAN.R-project.org/package=msigdbr
## - Kamil Slowikowski (2021). ggrepel: Automatically Position Non-Overlapping Text Labels with 'ggplot2'. R package version 0.9.1. https://CRAN.R-project.org/package=ggrepel
## - KD Hansen, RA Irizarry, and Z Wu, Removing technical variability in RNA-seq data using conditional quantile normalization. Biostatistics 2012 vol. 13(2) pp. 204-216.
## - Kevin Blighe, Sharmila Rana and Myles Lewis (2020). EnhancedVolcano: Publication-ready volcano plots with enhanced colouring and labeling. R package version 1.8.0. https://github.com/kevinblighe/EnhancedVolcano
## - Lawrence M, Huber W, Pag\`es H, Aboyoun P, Carlson M, et al. (2013) Software for Computing and Annotating Genomic Ranges. PLoS Comput Biol 9(8): e1003118. doi:10.1371/journal.pcbi.1003118
## - Lawrence M, Huber W, Pag\`es H, Aboyoun P, Carlson M, et al. (2013) Software for Computing and Annotating Genomic Ranges. PLoS Comput Biol 9(8): e1003118. doi:10.1371/journal.pcbi.1003118
## - Lawrence M, Huber W, Pag\`es H, Aboyoun P, Carlson M, et al. (2013) Software for Computing and Annotating Genomic Ranges. PLoS Comput Biol 9(8): e1003118. doi:10.1371/journal.pcbi.1003118
## - Liao Y, Smyth GK and Shi W (2019). The R package Rsubread is easier, faster, cheaper and better for alignment and quantification of RNA sequencing reads. Nucleic Acids Research 47(8), e47.
## - Love, M.I., Huber, W., Anders, S. Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2 Genome Biology 15(12):550 (2014)
## - Mapping identifiers for the integration of genomic datasets with the R/Bioconductor package biomaRt. Steffen Durinck, Paul T. Spellman, Ewan Birney and Wolfgang Huber, Nature Protocols 4, 1184-1191 (2009).
## - Marc Carlson (2020). org.Mm.eg.db: Genome wide annotation for Mouse. R package version 3.12.0.
## - Martin Maechler (2019). nor1mix: Normal aka Gaussian (1-d) Mixture Models (S3 Classes and Methods). R package version 1.3-0. https://CRAN.R-project.org/package=nor1mix
## - Martin Morgan, Hervé Pagès, Valerie Obenchain and Nathaniel Hayden (2021). Rsamtools: Binary alignment (BAM), FASTA, variant call (BCF), and tabix file import. R package version 2.7.1. https://bioconductor.org/packages/Rsamtools
## - Martin Morgan, Valerie Obenchain, Jim Hester and Hervé Pagès (2020). SummarizedExperiment: SummarizedExperiment container. R package version 1.20.0. https://bioconductor.org/packages/SummarizedExperiment
## - Martin Morgan, Valerie Obenchain, Michel Lang, Ryan Thompson and Nitesh Turaga (2020). BiocParallel: Bioconductor facilities for parallel evaluation. R package version 1.24.1. https://github.com/Bioconductor/BiocParallel
## - Michael I. Love, Charlotte Soneson, Peter F. Hickey, Lisa K. Johnson, N. Tessa Pierce, Lori Shepherd, Martin Morgan, Rob Patro Tximeta: Reference sequence checksums for provenance identification in RNA-seq PLOS Computational Biology
## - M. Morgan, S. Anders, M. Lawrence, P. Aboyoun, H. Pag\`es, and R. Gentleman (2009): "ShortRead: a Bioconductor package for input, quality assessment and exploration of high-throughput sequence data". Bioinformatics 25:2607-2608.
## - Orchestrating high-throughput genomic analysis with Bioconductor. W. Huber, V.J. Carey, R. Gentleman, ..., M. Morgan Nature Methods, 2015:12, 115.
## - Orchestrating high-throughput genomic analysis with Bioconductor. W. Huber, V.J. Carey, R. Gentleman, ..., M. Morgan Nature Methods, 2015:12, 115.
## - Raivo Kolde (2019). pheatmap: Pretty Heatmaps. R package version 1.0.12. https://CRAN.R-project.org/package=pheatmap
## - R Core Team (2021). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
## - Ritchie, M.E., Phipson, B., Wu, D., Hu, Y., Law, C.W., Shi, W., and Smyth, G.K. (2015). limma powers differential expression analyses for RNA-sequencing and microarray studies. Nucleic Acids Research 43(7), e47.
## - Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression analysis of digital gene expression data. Bioinformatics 26, 139-140
## - Roger Koenker (2021). quantreg: Quantile Regression. R package version 5.85. https://CRAN.R-project.org/package=quantreg
## - Roger Koenker (2021). SparseM: Sparse Linear Algebra. R package version 1.81. https://CRAN.R-project.org/package=SparseM
## - Scrucca L., Fop M., Murphy T. B. and Raftery A. E. (2016) mclust 5: clustering, classification and density estimation using Gaussian finite mixture models The R Journal 8/1, pp. 289-317
## - Simon Garnier, Noam Ross, Robert Rudis, Antônio P. Camargo, Marco Sciaini, and Cédric Scherer (2021). Rvision - Colorblind-Friendly Color Maps for R. R package version 0.4.0.
## - Simon Garnier, Noam Ross, Robert Rudis, Antônio P. Camargo, Marco Sciaini, and Cédric Scherer (2021). Rvision - Colorblind-Friendly Color Maps for R. R package version 0.6.0.
## - Sonali Arora, Martin Morgan, Marc Carlson and H. Pagès (2021). GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style. R package version 1.26.7. https://bioconductor.org/packages/GenomeInfoDb